suppressMessages(library(tidyverse))
suppressMessages(library(maps))
suppressMessages(library(lubridate))
suppressMessages(library(viridis))
suppressMessages(library(janitor))
suppressMessages(library(scales))
suppressMessages(library(corrplot))
suppressMessages(library(RColorBrewer))
suppressMessages(library(wesanderson))
suppressMessages(library(htmltools))
suppressMessages(library(webr))
suppressMessages(library(fastDummies))
sales<-read.csv("train.csv")
head(sales,3)
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 1 1 CA-2017-152156 08/11/2017 11/11/2017 Second Class CG-12520
## 2 2 CA-2017-152156 08/11/2017 11/11/2017 Second Class CG-12520
## 3 3 CA-2017-138688 12/06/2017 16/06/2017 Second Class DV-13045
## Customer.Name Segment Country City State Postal.Code
## 1 Claire Gute Consumer United States Henderson Kentucky 42420
## 2 Claire Gute Consumer United States Henderson Kentucky 42420
## 3 Darrin Van Huff Corporate United States Los Angeles California 90036
## Region Product.ID Category Sub.Category
## 1 South FUR-BO-10001798 Furniture Bookcases
## 2 South FUR-CH-10000454 Furniture Chairs
## 3 West OFF-LA-10000240 Office Supplies Labels
## Product.Name Sales
## 1 Bush Somerset Collection Bookcase 261.96
## 2 Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back 731.94
## 3 Self-Adhesive Address Labels for Typewriters by Universal 14.62
dim(sales)
## [1] 9800 18
summary(sales)
## Row.ID Order.ID Order.Date Ship.Date
## Min. : 1 Length:9800 Length:9800 Length:9800
## 1st Qu.:2451 Class :character Class :character Class :character
## Median :4900 Mode :character Mode :character Mode :character
## Mean :4900
## 3rd Qu.:7350
## Max. :9800
##
## Ship.Mode Customer.ID Customer.Name Segment
## Length:9800 Length:9800 Length:9800 Length:9800
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Country City State Postal.Code
## Length:9800 Length:9800 Length:9800 Min. : 1040
## Class :character Class :character Class :character 1st Qu.:23223
## Mode :character Mode :character Mode :character Median :58103
## Mean :55273
## 3rd Qu.:90008
## Max. :99301
## NA's :11
## Region Product.ID Category Sub.Category
## Length:9800 Length:9800 Length:9800 Length:9800
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Product.Name Sales
## Length:9800 Min. : 0.444
## Class :character 1st Qu.: 17.248
## Mode :character Median : 54.490
## Mean : 230.769
## 3rd Qu.: 210.605
## Max. :22638.480
##
str(sales)
## 'data.frame': 9800 obs. of 18 variables:
## $ Row.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Order.ID : chr "CA-2017-152156" "CA-2017-152156" "CA-2017-138688" "US-2016-108966" ...
## $ Order.Date : chr "08/11/2017" "08/11/2017" "12/06/2017" "11/10/2016" ...
## $ Ship.Date : chr "11/11/2017" "11/11/2017" "16/06/2017" "18/10/2016" ...
## $ Ship.Mode : chr "Second Class" "Second Class" "Second Class" "Standard Class" ...
## $ Customer.ID : chr "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
## $ Customer.Name: chr "Claire Gute" "Claire Gute" "Darrin Van Huff" "Sean O'Donnell" ...
## $ Segment : chr "Consumer" "Consumer" "Corporate" "Consumer" ...
## $ Country : chr "United States" "United States" "United States" "United States" ...
## $ City : chr "Henderson" "Henderson" "Los Angeles" "Fort Lauderdale" ...
## $ State : chr "Kentucky" "Kentucky" "California" "Florida" ...
## $ Postal.Code : int 42420 42420 90036 33311 33311 90032 90032 90032 90032 90032 ...
## $ Region : chr "South" "South" "West" "South" ...
## $ Product.ID : chr "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
## $ Category : chr "Furniture" "Furniture" "Office Supplies" "Furniture" ...
## $ Sub.Category : chr "Bookcases" "Chairs" "Labels" "Tables" ...
## $ Product.Name : chr "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
## $ Sales : num 262 731.9 14.6 957.6 22.4 ...
colnames(sales)
## [1] "Row.ID" "Order.ID" "Order.Date" "Ship.Date"
## [5] "Ship.Mode" "Customer.ID" "Customer.Name" "Segment"
## [9] "Country" "City" "State" "Postal.Code"
## [13] "Region" "Product.ID" "Category" "Sub.Category"
## [17] "Product.Name" "Sales"
sum(duplicated(sales))
## [1] 0
colSums(is.na(sales))
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode
## 0 0 0 0 0
## Customer.ID Customer.Name Segment Country City
## 0 0 0 0 0
## State Postal.Code Region Product.ID Category
## 0 11 0 0 0
## Sub.Category Product.Name Sales
## 0 0 0
sales[is.na(sales$Postal.Code), ]
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 2235 2235 CA-2018-104066 05/12/2018 10/12/2018 Standard Class QJ-19255
## 5275 5275 CA-2016-162887 07/11/2016 09/11/2016 Second Class SV-20785
## 8799 8799 US-2017-150140 06/04/2017 10/04/2017 Standard Class VM-21685
## 9147 9147 US-2017-165505 23/01/2017 27/01/2017 Standard Class CB-12535
## 9148 9148 US-2017-165505 23/01/2017 27/01/2017 Standard Class CB-12535
## 9149 9149 US-2017-165505 23/01/2017 27/01/2017 Standard Class CB-12535
## 9387 9387 US-2018-127292 19/01/2018 23/01/2018 Standard Class RM-19375
## 9388 9388 US-2018-127292 19/01/2018 23/01/2018 Standard Class RM-19375
## 9389 9389 US-2018-127292 19/01/2018 23/01/2018 Standard Class RM-19375
## 9390 9390 US-2018-127292 19/01/2018 23/01/2018 Standard Class RM-19375
## 9742 9742 CA-2016-117086 08/11/2016 12/11/2016 Standard Class QJ-19255
## Customer.Name Segment Country City State Postal.Code
## 2235 Quincy Jones Corporate United States Burlington Vermont NA
## 5275 Stewart Visinsky Consumer United States Burlington Vermont NA
## 8799 Valerie Mitchum Home Office United States Burlington Vermont NA
## 9147 Claudia Bergmann Corporate United States Burlington Vermont NA
## 9148 Claudia Bergmann Corporate United States Burlington Vermont NA
## 9149 Claudia Bergmann Corporate United States Burlington Vermont NA
## 9387 Raymond Messe Consumer United States Burlington Vermont NA
## 9388 Raymond Messe Consumer United States Burlington Vermont NA
## 9389 Raymond Messe Consumer United States Burlington Vermont NA
## 9390 Raymond Messe Consumer United States Burlington Vermont NA
## 9742 Quincy Jones Corporate United States Burlington Vermont NA
## Region Product.ID Category Sub.Category
## 2235 East TEC-AC-10001013 Technology Accessories
## 5275 East FUR-CH-10000595 Furniture Chairs
## 8799 East TEC-PH-10002555 Technology Phones
## 9147 East TEC-AC-10002926 Technology Accessories
## 9148 East OFF-AR-10003477 Office Supplies Art
## 9149 East OFF-ST-10001526 Office Supplies Storage
## 9387 East OFF-PA-10000157 Office Supplies Paper
## 9388 East OFF-PA-10001970 Office Supplies Paper
## 9389 East OFF-AP-10000828 Office Supplies Appliances
## 9390 East OFF-EN-10001509 Office Supplies Envelopes
## 9742 East FUR-BO-10004834 Furniture Bookcases
## Product.Name Sales
## 2235 Logitech ClearChat Comfort/USB Headset H390 205.03
## 5275 Safco Contoured Stacking Chairs 715.20
## 8799 Nortel Meridian M5316 Digital phone 1294.75
## 9147 Logitech Wireless Marathon Mouse M705 99.98
## 9148 4009 Highlighters 8.04
## 9149 Iceberg Mobile Mega Data/Printer Cart 1564.29
## 9387 Xerox 191 79.92
## 9388 Xerox 1881 12.28
## 9389 Avanti 4.4 Cu. Ft. Refrigerator 542.94
## 9390 Poly String Tie Envelopes 2.04
## 9742 Riverside Palais Royal Lawyers Bookcase, Royale Cherry Finish 4404.90
# All 11 missing postal codes belong to Burlington, Vermont (see rows printed
# above), whose ZIP is 05401. NOTE: the literal 05401 parses as the number 5401
# in R -- an integer column cannot carry the leading zero, so we store 5401L
# explicitly and document the real ZIP here instead.
sales$Postal.Code <- replace_na(sales$Postal.Code, 5401L)
colSums(is.na(sales))
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode
## 0 0 0 0 0
## Customer.ID Customer.Name Segment Country City
## 0 0 0 0 0
## State Postal.Code Region Product.ID Category
## 0 0 0 0 0
## Sub.Category Product.Name Sales
## 0 0 0
# Parse both date columns (stored as "dd/mm/yyyy" strings) into Date objects
# in a single transformation.
sales <- sales %>%
  mutate(across(c(Order.Date, Ship.Date), ~ as.Date(.x, "%d/%m/%Y")))
class(sales$Order.Date)
## [1] "Date"
class(sales$Ship.Date)
## [1] "Date"
cat("Number of unique Order IDs:\n")
## Number of unique Order IDs:
length(unique(sales$Order.ID))
## [1] 4922
cat("Unique Ship Mode:\n")
## Unique Ship Mode:
unique(sales$Ship.Mode)
## [1] "Second Class" "Standard Class" "First Class" "Same Day"
cat("Number of unique Customer IDs:\n")
## Number of unique Customer IDs:
length(unique(sales$Customer.ID))
## [1] 793
cat("Number of unique Customer name:\n")
## Number of unique Customer name:
length(unique(sales$Customer.Name))
## [1] 793
cat("Unique segment:\n")
## Unique segment:
unique(sales$Segment)
## [1] "Consumer" "Corporate" "Home Office"
cat("Unique country:\n")
## Unique country:
unique(sales$Country)
## [1] "United States"
cat("Number of unique Cities:\n")
## Number of unique Cities:
length(unique(sales$City))
## [1] 529
cat("Unique Region:\n")
## Unique Region:
unique(sales$Region)
## [1] "South" "West" "Central" "East"
#Top 5 customers by value
### assuming no customers have same name because length of unique customer.id is same as length of unique customer.name
customer_sales <- sales %>%
group_by(Customer.Name) %>%
summarise(sales=round(sum(Sales),2)) %>%
arrange(desc(sales))
head(customer_sales,5)
## # A tibble: 5 × 2
## Customer.Name sales
## <chr> <dbl>
## 1 Sean Miller 25043.
## 2 Tamara Chand 19052.
## 3 Raymond Buch 15117.
## 4 Tom Ashbrook 14596.
## 5 Adrian Barton 14474.
ggplot(head(customer_sales,10), aes(x=fct_reorder(Customer.Name,sales), y=sales)) +geom_col(width = 0.7, fill='skyblue', col="black") + labs(title = " Top 10 Most Valuable Customers", y="Sales", x="Customer") +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 20))+
geom_text(aes(label = sales), size = 3.5, vjust = -0.5, hjust = 0.5, angle = 0)
# mean sales amount
mean(sales$Sales)
## [1] 230.7691
#sales by order.id
sales_by_order<-sales %>%
group_by(Order.ID) %>%
summarise(sales=round(sum(Sales),2)) %>%
arrange(desc(sales))
# mean sales amount by order id
mean(sales_by_order$sales)
## [1] 459.4752
# sales distribution
# Per-order sales distribution on a log10 x-axis: histogram scaled to density
# with a kernel-density overlay. Uses after_stat(density) and linewidth, the
# modern replacements for the deprecated ..density.. and size aesthetics
# (ggplot2 >= 3.4.0 -- the original emitted deprecation warnings for both).
ggplot(sales_by_order, aes(x = sales)) +
  geom_histogram(aes(y = after_stat(density)), fill = 'white', color = 'black') +
  geom_density(fill = 'skyblue', alpha = 0.4, linewidth = 0.6) + scale_x_log10() +
  labs(title = "Sales Distribution")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# sales date analysis
# Derive year / month / day-of-week columns from Order.Date in one mutate();
# the three columns land directly after Order.Date in the stated order,
# matching the result of the original three separate mutate() calls.
# NOTE(review): format(., "%b") / "%a" produce locale-dependent labels; the
# factor levels applied below assume an English locale -- confirm if this
# script is run elsewhere.
sales <- sales %>%
  mutate(
    order_year  = format(Order.Date, "%Y"),
    order_month = format(Order.Date, "%b"),
    order_day   = format(Order.Date, "%a"),
    .after = Order.Date
  )
#Converting month and day to factors for later use
sales$order_month<-factor(sales$order_month,
levels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"))
sales$order_day<-factor(sales$order_day,
levels = c("Mon","Tue","Wed","Thu","Fri","Sat","Sun"))
head(sales,3)
## Row.ID Order.ID Order.Date order_year order_month order_day Ship.Date
## 1 1 CA-2017-152156 2017-11-08 2017 Nov Wed 2017-11-11
## 2 2 CA-2017-152156 2017-11-08 2017 Nov Wed 2017-11-11
## 3 3 CA-2017-138688 2017-06-12 2017 Jun Mon 2017-06-16
## Ship.Mode Customer.ID Customer.Name Segment Country City
## 1 Second Class CG-12520 Claire Gute Consumer United States Henderson
## 2 Second Class CG-12520 Claire Gute Consumer United States Henderson
## 3 Second Class DV-13045 Darrin Van Huff Corporate United States Los Angeles
## State Postal.Code Region Product.ID Category Sub.Category
## 1 Kentucky 42420 South FUR-BO-10001798 Furniture Bookcases
## 2 Kentucky 42420 South FUR-CH-10000454 Furniture Chairs
## 3 California 90036 West OFF-LA-10000240 Office Supplies Labels
## Product.Name Sales
## 1 Bush Somerset Collection Bookcase 261.96
## 2 Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back 731.94
## 3 Self-Adhesive Address Labels for Typewriters by Universal 14.62
sales_by_day<-sales %>%
group_by(order_day) %>%
summarise(sales=round(sum(Sales),0))
ggplot(sales_by_day, aes(x=order_day,y=sales)) +
geom_col(fill="skyblue", alpha=0.7, width = 0.5, color="black") +
labs(x="",y="Sales", title="Sales by Day") +
theme() + theme(plot.title = element_text(hjust=0.5)) +
scale_y_continuous(labels = comma) +
geom_text(aes(label = sales), size = 3.5, vjust = -0.5, hjust = 0.5, angle = 0)
sales_by_month<-sales %>%
group_by(order_month) %>%
summarise(sales=round(sum(Sales),0))
ggplot(sales_by_month, aes(x=order_month,y=sales)) +
geom_col(fill="skyblue", alpha=0.7, width = 0.7, color="black") +
labs(x="",y="Sales", title="Sales by Month") +
theme(plot.title = element_text(hjust=0.5)) +
scale_y_continuous(labels = comma)+
geom_text(aes(label = sales), size = 3.5, vjust = -0.5, hjust = 0.5, angle = 0)
unique(sales$order_year)
## [1] "2017" "2016" "2015" "2018"
# order_year holds character values ("2015"..."2018" per the unique() call
# above), so spell the factor levels as characters rather than relying on
# R's silent numeric-to-character coercion of c(2015, ...).
sales$order_year <- factor(sales$order_year, levels = c("2015", "2016", "2017", "2018"))
sales_by_year<-sales %>%
group_by(order_year) %>%
summarise(sales=round(sum(Sales),0))
# Annual sales bar chart. Fixed the title: it was copy-pasted from the
# monthly chart and read "Sales by Month".
ggplot(sales_by_year, aes(x = order_year, y = sales)) +
  geom_col(fill = "skyblue", alpha = 0.7, width = 0.5, color = "black") +
  labs(x = "", y = "Sales", title = "Sales by Year") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = comma) +
  geom_text(aes(label = sales), size = 3.5, vjust = -0.5, hjust = 0.5, angle = 0)
ship_mode_counts <- table(sales$Ship.Mode)
pie(ship_mode_counts, labels = paste0(names(ship_mode_counts), ": ", round(prop.table(ship_mode_counts) * 100,0), "%"))
sales_by_seg <- sales %>%
group_by(Segment) %>%
summarise(sales = sum(Sales))
# Sales by customer segment, flipped to horizontal bars. The original was
# missing the `+` before coord_flip(), so the flip was never applied and the
# bare CoordFlip ggproto object was printed instead (see the echoed
# <ggproto object: Class CoordFlip ...> output below).
ggplot(sales_by_seg, aes(x = Segment, y = sales)) +
  geom_col(fill = 'skyblue', alpha = 0.8, width = 0.5) +
  labs(x = "Segment", y = "Sales") +
  geom_text(aes(label = sales), size = 3.5, vjust = -0.5, hjust = 0.5, angle = 0) +
  coord_flip()
## <ggproto object: Class CoordFlip, CoordCartesian, Coord, gg>
## aspect: function
## backtransform_range: function
## clip: on
## default: FALSE
## distance: function
## expand: TRUE
## is_free: function
## is_linear: function
## labels: function
## limits: list
## modify_scales: function
## range: function
## render_axis_h: function
## render_axis_v: function
## render_bg: function
## render_fg: function
## setup_data: function
## setup_layout: function
## setup_panel_guides: function
## setup_panel_params: function
## setup_params: function
## train_panel_guides: function
## transform: function
## super: <ggproto object: Class CoordFlip, CoordCartesian, Coord, gg>
pie(sales_by_seg$sales, labels = paste0(sales_by_seg$Segment, ": ", round(prop.table(sales_by_seg$sales) * 100, 0), "%"))
suppressMessages(library(ggthemes))
foo <- sales %>%
group_by(State, order_month, order_year) %>%
summarise(sales = sum(Sales))
## `summarise()` has grouped output by 'State', 'order_month'. You can override
## using the `.groups` argument.
top_states <- foo %>%
group_by(State) %>%
summarise(total_sales = sum(sales)) %>%
top_n(3, total_sales) %>%
pull(State)
filtered_data <- foo %>%
filter(State %in% top_states)
ggplot(filtered_data, aes(x = paste(order_month, order_year, sep = "-"), y = sales, group = State, color = State)) +
geom_line() +
theme_tufte() +
labs(x = "Year", y = "Sales", color = "State") +
scale_x_discrete(labels = function(x) gsub("-", " ", x)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
top10_states <- foo %>%
group_by(State) %>%
summarise(total_sales = sum(sales)) %>%
top_n(10, total_sales) %>%
arrange(total_sales)
# Top-10 states by total sales, horizontal bars. theme_minimal() replaces the
# whole active theme, so it must come BEFORE the theme(axis.text.x = ...)
# tweak; in the original it came last and silently discarded the 45-degree
# axis-label rotation.
ggplot(top10_states, aes(x = reorder(State, desc(total_sales)), y = total_sales)) +
  geom_bar(stat = "identity", alpha = 0.7, fill = "pink") +
  geom_text(aes(label = total_sales), vjust = -0.5, hjust = 1, size = 3, color = "black") +
  labs(x = "State", y = "Total Sales") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip()
sales_by_region<-sales %>%
group_by(Region) %>%
summarise(sales=round(sum(Sales),0))
PieDonut(sales_by_region,aes(Region,count=sales),
title = "Sales by Region",r0=0.6) +
scale_fill_manual(values = c("skyblue", "grey", "pink", "coral", "purple"))
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the webr package.
## Please report the issue at <https://github.com/cardiomoon/webr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
sales_by_subcat<-sales %>%
group_by(Category,Sub.Category) %>%
summarise(sale=sum(Sales))
## `summarise()` has grouped output by 'Category'. You can override using the
## `.groups` argument.
PieDonut(sales_by_subcat,
aes(Category,Sub.Category,count=sale),
title = "Sales by Category and Sub-Category")
cat<-sales[,c("Category","Sales")]
catdummy<-dummy_cols(cat)
catdummy<-catdummy %>%
rename(furniture=3, office_supplies=4, technology=5)
catdummy<-catdummy[,2:5]
#convert columns data type to numeric
catdummy$furniture<-as.numeric(catdummy$furniture)
catdummy$office_supplies<-as.numeric(catdummy$office_supplies)
catdummy$technology<-as.numeric(catdummy$technology)
#create a correlation matrix
corcatdummy<-cor(catdummy)
#plotting correlation matrix
corrplot(corcatdummy, type = "upper", method = "color",
addCoef.col = "black",
tl.col = "black", diag = FALSE, order = "hclust")
seg<-sales[,c("Segment","Sales")]
segdummy<-dummy_cols(seg)
head(segdummy)
## Segment Sales Segment_Consumer Segment_Corporate Segment_Home Office
## 1 Consumer 261.9600 1 0 0
## 2 Consumer 731.9400 1 0 0
## 3 Corporate 14.6200 0 1 0
## 4 Consumer 957.5775 1 0 0
## 5 Consumer 22.3680 1 0 0
## 6 Consumer 48.8600 1 0 0
segdummy<-segdummy %>%
rename(consumer=3,corporate=4, home_office=5)
segdummy$consumer<-as.numeric(segdummy$consumer)
segdummy$corporate<-as.numeric(segdummy$corporate)
segdummy$home_office<-as.numeric(segdummy$home_office)
segdummy<-segdummy[,2:5]
corsegdummy<-cor(segdummy)
corrplot(corsegdummy, type = "upper", method = "color",
addCoef.col = "black",
tl.col = "black", diag = FALSE, order = "hclust")
ship<-sales[,c("Ship.Mode","Sales")]
shipdummy<-dummy_cols(ship)
shipdummy<-shipdummy[,2:6] %>%
rename(first_class=2, same_day=3, second_class=4)
shipdummy$first_class<-as.numeric(shipdummy$first_class)
shipdummy$same_day<-as.numeric(shipdummy$same_day)
shipdummy$second_class<-as.numeric(shipdummy$second_class)
corshipdummy<-cor(shipdummy)
corrplot(corshipdummy, type = "upper", method = "color",
addCoef.col = "black",
tl.col = "black", diag = FALSE, order = "hclust")
re<-sales[,c("Region","Sales")]
redummy<-dummy_cols(re)
redummy<-redummy[,2:6] %>%
mutate_all(as.numeric)
corredummy<-cor(redummy)
corrplot(corredummy, type = "upper", method = "color",
addCoef.col = "black",
tl.col = "black", diag = FALSE, order = "hclust")
sales_small <- sales[c("Order.Date", "Sales")]
hist(sales_small$Order.Date, breaks = 18,
main = "Order Date Histogram",
xlab = "Order Date",
ylab = "Count")
foo <- sales %>%
group_by(State, Order.Date, order_month, order_year) %>%
summarise(sales = sum(Sales))
## `summarise()` has grouped output by 'State', 'Order.Date', 'order_month'. You
## can override using the `.groups` argument.
top_states <- foo %>%
group_by(State) %>%
summarise(total_sales = sum(sales)) %>%
top_n(3, total_sales) %>%
pull(State)
df <- foo %>%
filter(State %in% top_states)
dim(df)
## [1] 1455 5
df$Order.Date <- floor_date(df$Order.Date, "month")
df<- df %>%
group_by(State, Order.Date) %>%
summarise(sales = sum(sales))
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
ca_df <- subset(df, State == "California", select = c(Order.Date, sales))
ny_df <- subset(df, State == "New York", select = c(Order.Date, sales))
tx_df <- subset(df, State == "Texas", select = c(Order.Date, sales))
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ################################### WARNING ###################################
## # We noticed you have dplyr installed. The dplyr lag() function breaks how #
## # base R's lag() function is supposed to work, which breaks lag(my_xts). #
## # #
## # Calls to lag(my_xts) that you enter or source() into this session won't #
## # work correctly. #
## # #
## # All package code is unaffected because it is protected by the R namespace #
## # mechanism. #
## # #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## # You can use stats::lag() to make sure you're not using dplyr::lag(), or you #
## # can add conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## ################################### WARNING ###################################
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
ca_xts <- as.xts(ca_df$sales, order.by = ca_df$Order.Date)
ny_xts <- as.xts(ny_df$sales, order.by = ny_df$Order.Date)
tx_xts <- as.xts(tx_df$sales, order.by = tx_df$Order.Date)
plot(ca_xts, main = "CA Sales", ylab = "sales")
plot(ny_xts, main = "NY Sales", ylab = "sales")
plot(tx_xts, main = "TX Sales", ylab = "sales")
ca_train <- ca_xts["/2017-12-01"]
ca_test <- ca_xts["2018-01-01/"]
ny_train <- ny_xts["/2017-12-01"]
ny_test <- ny_xts["2018-01-01/"]
tx_train <- tx_xts["/2017-12-01"]
tx_test <- tx_xts["2018-01-01/"]
acf(ca_train)
acf(ny_train)
acf(tx_train)
suppressMessages(library(tseries))
adf.test(ca_train)
##
## Augmented Dickey-Fuller Test
##
## data: ca_train
## Dickey-Fuller = -2.5812, Lag order = 3, p-value = 0.347
## alternative hypothesis: stationary
kpss.test(ca_train)
##
## KPSS Test for Level Stationarity
##
## data: ca_train
## KPSS Level = 0.57412, Truncation lag parameter = 3, p-value = 0.02499
adf.test(ny_train)
##
## Augmented Dickey-Fuller Test
##
## data: ny_train
## Dickey-Fuller = -3.1615, Lag order = 3, p-value = 0.1212
## alternative hypothesis: stationary
kpss.test(ny_train)
## Warning in kpss.test(ny_train): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: ny_train
## KPSS Level = 0.21396, Truncation lag parameter = 3, p-value = 0.1
adf.test(tx_train)
##
## Augmented Dickey-Fuller Test
##
## data: tx_train
## Dickey-Fuller = -2.8367, Lag order = 3, p-value = 0.2476
## alternative hypothesis: stationary
kpss.test(tx_train)
## Warning in kpss.test(tx_train): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: tx_train
## KPSS Level = 0.066898, Truncation lag parameter = 3, p-value = 0.1
suppressMessages(library(forecast))
ca_base <- auto.arima(ca_train, seasonal = FALSE)
ca_base
## Series: ca_train
## ARIMA(0,1,0)
##
## sigma^2 = 40706014: log likelihood = -356.3
## AIC=714.59 AICc=714.71 BIC=716.15
ny_base <- auto.arima(ny_train, seasonal = FALSE)
ny_base
## Series: ny_train
## ARIMA(0,0,0) with non-zero mean
##
## Coefficients:
## mean
## 5905.2494
## s.e. 928.9626
##
## sigma^2 = 31954513: log likelihood = -361.61
## AIC=727.22 AICc=727.59 BIC=730.39
tx_base <- auto.arima(tx_train, seasonal = FALSE)
tx_base
## Series: tx_train
## ARIMA(0,0,0) with non-zero mean
##
## Coefficients:
## mean
## 3476.4104
## s.e. 547.4565
##
## sigma^2 = 11097751: log likelihood = -342.58
## AIC=689.15 AICc=689.51 BIC=692.32
# CA ---- (section marker; the bare identifier `CA` errored with "object not found" when sourced)
a <- length(ca_test)
forecast_ca_base <- forecast(ca_base, h = a)
plot(forecast_ca_base, main = "Forecasts of CA monthly sales 2018 - baseline model")
residuals_ca <- ca_test - forecast_ca_base$mean
## Warning: Incompatible methods ("Ops.xts", "Ops.ts") for "-"
checkresiduals(residuals_ca)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 7.4532, df = 3, p-value = 0.05877
##
## Model df: 0. Total lags used: 3
acc_ca_base <- accuracy(forecast_ca_base, ca_test)
acc_ca_base
## ME RMSE MAE MPE MAPE MASE
## Training set 530.7376 6290.89 4743.383 -48.45317 91.80765 0.9722362
## Test set -9503.7180 10403.43 9503.718 -107.21032 107.21032 1.9479468
## ACF1
## Training set -0.3233951
## Test set NA
# NY ---- (section marker; the bare identifier `NY` errored with "object not found" when sourced)
# Forecast over the NY hold-out horizon; use the test-set length (as the CA
# section does) instead of a hard-coded 12 so train/test splits stay in sync.
forecast_ny_base <- forecast(ny_base, h = length(ny_test))
plot(forecast_ny_base, main = "Forecasts of NY monthly sales 2018 - baseline model")
acc_ny_base <- accuracy(forecast_ny_base, ny_test)
acc_ny_base
## ME RMSE MAE MPE MAPE MASE
## Training set 1.730665e-12 5573.768 4290.426 -4573.4298 4603.0904 0.7619477
## Test set 1.909098e+03 7630.254 5732.466 -144.6164 189.5209 1.0180433
## ACF1
## Training set 0.02528753
## Test set NA
residuals_ny <- ny_test - forecast_ny_base$mean
## Warning: Incompatible methods ("Ops.xts", "Ops.ts") for "-"
checkresiduals(residuals_ny)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 5.6298, df = 3, p-value = 0.1311
##
## Model df: 0. Total lags used: 3
# TX ---- (section marker; the bare identifier `TX` errored with "object not found" when sourced)
# Forecast over the TX hold-out horizon; use the test-set length (as the CA
# section does) instead of a hard-coded 12 so train/test splits stay in sync.
forecast_tx_base <- forecast(tx_base, h = length(tx_test))
plot(forecast_tx_base, main = "Forecasts of TX monthly sales 2018 - baseline model")
acc_tx_base <- accuracy(forecast_tx_base, tx_test)
acc_tx_base
## ME RMSE MAE MPE MAPE MASE
## Training set 8.716187e-13 3284.734 2054.647 -290.5260 315.97409 0.6890568
## Test set 1.420695e+02 2171.060 1540.734 -50.1806 73.59972 0.5167082
## ACF1
## Training set -0.08386677
## Test set NA
residuals_tx <- tx_test - forecast_tx_base$mean
## Warning: Incompatible methods ("Ops.xts", "Ops.ts") for "-"
checkresiduals(residuals_tx)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 2.2786, df = 3, p-value = 0.5166
##
## Model df: 0. Total lags used: 3
# CA ---- (section marker; the bare identifier `CA` errored with "object not found" when sourced)
ca_SA <- auto.arima(ts(ca_train, frequency=12), seasonal = TRUE)
summary(ca_SA)
## Series: ts(ca_train, frequency = 12)
## ARIMA(0,0,0)(1,1,0)[12] with drift
##
## Coefficients:
## sar1 drift
## -0.5587 128.7082
## s.e. 0.1987 58.9000
##
## sigma^2 = 20386913: log likelihood = -237.22
## AIC=480.44 AICc=481.64 BIC=483.97
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -239.1278 3529.684 2213.39 -16.77474 31.89948 0.5269297 -0.1784056
forecast_ca_SA <- forecast(ca_SA, h = 12)
acc_ca_SA <- accuracy(forecast_ca_SA, ca_test)
acc_ca_SA
## ME RMSE MAE MPE MAPE MASE
## Training set -239.1278 3529.684 2213.390 -16.774740 31.89948 0.4536715
## Test set 1008.1372 3058.372 2559.622 4.742513 21.85629 0.5246376
## ACF1
## Training set -0.1784056
## Test set NA
plot(forecast_ca_SA, main = "Forecasts of CA monthly sales 2018 - SARIMA model")
residuals_ca1 <- ca_test - forecast_ca_SA$mean
## Warning: Incompatible methods ("Ops.xts", "Ops.ts") for "-"
checkresiduals(residuals_ca1)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 1.4383, df = 3, p-value = 0.6966
##
## Model df: 0. Total lags used: 3
# NY ---- (section marker; the bare identifier `NY` errored with "object not found" when sourced)
ny_SA <- auto.arima(ts(ny_train, frequency=12), seasonal = TRUE)
summary(ny_SA)
## Series: ts(ny_train, frequency = 12)
## ARIMA(0,0,0)(0,1,0)[12]
##
## sigma^2 = 16420566: log likelihood = -233.42
## AIC=468.85 AICc=469.03 BIC=470.02
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 181.801 3308.632 1873.802 2.31772 45.75619 0.6673032 0.03047806
forecast_ny_SA <- forecast(ny_SA, h = 12)
acc_ny_SA <- accuracy(forecast_ny_SA, ny_test)
acc_ny_SA
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 181.801 3308.632 1873.802 2.31772 45.75619 0.3327733 0.03047806
## Test set 1912.134 7451.183 5124.712 -43.14743 84.30004 0.9101108 NA
plot(forecast_ny_SA, main = "Forecasts of NY monthly sales 2018 - SARIMA model")
residuals_ny1 <- ny_test - forecast_ny_SA$mean
## Warning: Incompatible methods ("Ops.xts", "Ops.ts") for "-"
checkresiduals(residuals_ny1)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 1.8541, df = 3, p-value = 0.6032
##
## Model df: 0. Total lags used: 3
# TX ---- (section marker; the bare identifier `TX` errored with "object not found" when sourced)
tx_SA <- auto.arima(ts(tx_train, frequency=12), seasonal = TRUE)
summary(tx_SA)
## Series: ts(tx_train, frequency = 12)
## ARIMA(0,0,0) with non-zero mean
##
## Coefficients:
## mean
## 3476.4104
## s.e. 547.4565
##
## sigma^2 = 11097751: log likelihood = -342.58
## AIC=689.15 AICc=689.51 BIC=692.32
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 8.716187e-13 3284.734 2054.647 -290.526 315.9741 0.8631626
## ACF1
## Training set -0.08386677
forecast_tx_SA <- forecast(tx_SA, h = 12)
acc_tx_SA <- accuracy(forecast_tx_SA, tx_test)
acc_tx_SA
## ME RMSE MAE MPE MAPE MASE
## Training set 8.716187e-13 3284.734 2054.647 -290.5260 315.97409 0.6890568
## Test set 1.420695e+02 2171.060 1540.734 -50.1806 73.59972 0.5167082
## ACF1
## Training set -0.08386677
## Test set NA
plot(forecast_tx_SA, main = "Forecasts of TX monthly sales 2018 - SARIMA model")
residuals_tx1 <- tx_test - forecast_tx_SA$mean
## Warning: Incompatible methods ("Ops.xts", "Ops.ts") for "-"
checkresiduals(residuals_tx1)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 2.2786, df = 3, p-value = 0.5166
##
## Model df: 0. Total lags used: 3
# CA ---- (section marker; the bare identifier `CA` errored with "object not found" when sourced)
HW.ca <- HoltWinters(ts(ca_train, frequency=12))
HW.ca
## Holt-Winters exponential smoothing with trend and additive seasonal component.
##
## Call:
## HoltWinters(x = ts(ca_train, frequency = 12))
##
## Smoothing parameters:
## alpha: 0.04188232
## beta : 1
## gamma: 0.5762624
##
## Coefficients:
## [,1]
## a 12756.7598
## b 725.3795
## s1 -2804.7811
## s2 -4604.6022
## s3 4418.2333
## s4 -258.1442
## s5 -1135.5021
## s6 2491.7567
## s7 503.9543
## s8 -244.7041
## s9 489.4752
## s10 -2298.6156
## s11 4900.7352
## s12 8772.0362
forecast_HW.ca <- forecast(HW.ca, h = 12)
acc_HW.ca <- accuracy(forecast_HW.ca, ca_test)
acc_HW.ca
## ME RMSE MAE MPE MAPE MASE
## Training set 772.0022 4924.683 3278.289 -5.257923 45.60812 0.6719405
## Test set -6268.6492 7123.830 6268.649 -59.193624 59.19362 1.2848651
## ACF1
## Training set -0.2178956
## Test set NA
plot(forecast_HW.ca, main = "Forecasts of CA monthly sales 2018 - HW model")
residuals_ca2 <- ca_test - forecast_HW.ca$mean
## Warning: Incompatible methods ("Ops.xts", "Ops.ts") for "-"
checkresiduals(residuals_ca2)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 3.8892, df = 3, p-value = 0.2737
##
## Model df: 0. Total lags used: 3
# NY ---- (section marker; the bare identifier `NY` errored with "object not found" when sourced)
HW.ny <- HoltWinters(ts(ny_train, frequency=12))
HW.ny
## Holt-Winters exponential smoothing with trend and additive seasonal component.
##
## Call:
## HoltWinters(x = ts(ny_train, frequency = 12))
##
## Smoothing parameters:
## alpha: 0.1652065
## beta : 0
## gamma: 1
##
## Coefficients:
## [,1]
## a 5767.9254
## b 137.6698
## s1 -5426.9718
## s2 -4627.4806
## s3 204.2712
## s4 -3007.2745
## s5 -1559.6880
## s6 -1928.5106
## s7 -4678.6197
## s8 -2970.9045
## s9 4191.1736
## s10 -2322.8889
## s11 1873.7533
## s12 7442.5966
forecast_HW.ny <- forecast(HW.ny, h = 12)
acc_HW.ny <- accuracy(forecast_HW.ny, ny_test)
acc_HW.ny
## ME RMSE MAE MPE MAPE MASE
## Training set -639.4068 3888.855 2584.666 -9.857669 51.36922 0.4590175
## Test set 2219.1132 7120.570 4695.783 -12.867198 63.60048 0.8339361
## ACF1
## Training set -0.1772572
## Test set NA
plot(forecast_HW.ny, main = "Forecasts of NY monthly sales 2018 - HW model")
residuals_ny2 <- ny_test - forecast_HW.ny$mean
## Warning: Incompatible methods ("Ops.xts", "Ops.ts") for "-"
checkresiduals(residuals_ny2)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 1.556, df = 3, p-value = 0.6694
##
## Model df: 0. Total lags used: 3
# TX ---- (section marker; the bare identifier `TX` errored with "object not found" when sourced)
HW.tx <- HoltWinters(ts(tx_train, frequency=12))
HW.tx
## Holt-Winters exponential smoothing with trend and additive seasonal component.
##
## Call:
## HoltWinters(x = ts(tx_train, frequency = 12))
##
## Smoothing parameters:
## alpha: 0
## beta : 0
## gamma: 0.6406015
##
## Coefficients:
## [,1]
## a 1122.8424
## b -158.5682
## s1 -2513.6859
## s2 -2898.7952
## s3 2415.6664
## s4 1218.0931
## s5 -189.1532
## s6 -1453.9943
## s7 399.4519
## s8 1606.1684
## s9 4826.0672
## s10 -273.5921
## s11 3119.0982
## s12 1737.5450
forecast_HW.tx <- forecast(HW.tx, h = 12)
acc_HW.tx <- accuracy(forecast_HW.tx, tx_test)
acc_HW.tx
## ME RMSE MAE MPE MAPE MASE
## Training set 519.8805 4025.533 2456.531 -14.16475 123.2660 0.8238346
## Test set 2860.2586 3575.189 3137.229 73.46266 104.6471 1.0521170
## ACF1
## Training set -0.2082932
## Test set NA
plot(forecast_HW.tx, main = "Forecasts of TX monthly sales 2018 - HW model")
residuals_tx2 <- tx_test - forecast_HW.tx$mean
## Warning: Incompatible methods ("Ops.xts", "Ops.ts") for "-"
checkresiduals(residuals_tx2)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 2.6453, df = 3, p-value = 0.4496
##
## Model df: 0. Total lags used: 3
# CA ----
# (bare "CA" heading from the source document, commented so it no longer
# errors with "object 'CA' not found" when the script is run)
# ARFIMA (fractionally differenced ARMA) fit on the CA monthly training series.
m41 <- forecast::arfima(ts(ca_train))
summary(m41)
##
## Call:
## forecast::arfima(y = ts(ca_train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## d 1.179e-01 3.762e-06 31336 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma[eps] = 5110.637
## [d.tol = 0.0001221, M = 100, h = 3.788e-06]
## Log likelihood: -358.5 ==> AIC = 721.0526 [2 deg.freedom]
pred_41 <- forecast(m41, h = length(ca_test))
# Coerce xts (ca_test) and ts (forecast mean) to plain numeric before
# subtracting; mixing the classes raised an "Incompatible methods" warning.
resid_41 <- as.numeric(ca_test) - as.numeric(pred_41$mean)
checkresiduals(resid_41)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 7.7712, df = 3, p-value = 0.05099
##
## Model df: 0. Total lags used: 3
# Mean hold-out residual = average forecast bias on the test window.
mean(resid_41)
## [1] 2985.732
# NY ----
# (bare "NY" heading from the source document, commented so it no longer
# errors with "object 'NY' not found" when the script is run)
# ARFIMA (fractionally differenced ARMA) fit on the NY monthly training series.
m42 <- forecast::arfima(ts(ny_train))
summary(m42)
##
## Call:
## forecast::arfima(y = ts(ny_train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## d 3.517e-03 3.794e-06 927.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma[eps] = 5573.727
## [d.tol = 0.0001221, M = 100, h = 3.821e-06]
## Log likelihood: -361.6 ==> AIC = 727.2199 [2 deg.freedom]
pred_42 <- forecast(m42, h = length(ny_test))
# Coerce xts (ny_test) and ts (forecast mean) to plain numeric before
# subtracting; mixing the classes raised an "Incompatible methods" warning.
resid_42 <- as.numeric(ny_test) - as.numeric(pred_42$mean)
checkresiduals(resid_42)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 5.6349, df = 3, p-value = 0.1308
##
## Model df: 0. Total lags used: 3
# Mean hold-out residual = average forecast bias on the test window.
mean(resid_42)
## [1] 1901.498
# TX ----
# (bare "TX" heading from the source document, commented so it no longer
# errors with "object 'TX' not found" when the script is run)
# ARFIMA (fractionally differenced ARMA) fit on the TX monthly training series.
m43 <- forecast::arfima(ts(tx_train))
summary(m43)
##
## Call:
## forecast::arfima(y = ts(tx_train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## d 4.583e-05 3.595e-06 12.75 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma[eps] = 3284.751
## [d.tol = 0.0001221, M = 100, h = 3.62e-06]
## Log likelihood: -342.6 ==> AIC = 689.1481 [2 deg.freedom]
pred_43 <- forecast(m43, h = length(tx_test))
# Coerce xts (tx_test) and ts (forecast mean) to plain numeric before
# subtracting; mixing the classes raised an "Incompatible methods" warning.
resid_43 <- as.numeric(tx_test) - as.numeric(pred_43$mean)
checkresiduals(resid_43)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 2.2786, df = 3, p-value = 0.5166
##
## Model df: 0. Total lags used: 3
# Mean hold-out residual = average forecast bias on the test window.
mean(resid_43)
## [1] 142.0573
# Aggregate transaction-level sales to one total per order date.
# NOTE(review): assumes Order.Date was already parsed to Date class upstream
# (the raw CSV holds dd/mm/yyyy strings); otherwise the merge with the Date
# sequence below could not align. The 228-NA count suggests it was - confirm.
agg_sales <- sales %>%
group_by(Order.Date) %>%
summarise(sales = sum(Sales))
# Build a complete daily calendar spanning the data range, then left-pad the
# aggregated sales onto it so calendar days with no orders appear as NA.
drange <- seq(as.Date("2015-01-03"), as.Date("2018-12-30"), by="day")
dall <- data.frame(Order.Date = drange)
dall <- merge(agg_sales, dall, by=c("Order.Date"), all.y=TRUE)
# 228 of the 1458 calendar days had no recorded orders.
dim(dall[is.na(dall$sales),])
## [1] 228 2
dim(dall)
## [1] 1458 2
# Convert to an xts series indexed by date; treat no-order days as zero sales.
sales_xts <- as.xts(dall$sales, order.by = dall$Order.Date)
sales_xts <- na.fill(sales_xts, 0)
sum(is.na(sales_xts))
## [1] 0
plot(sales_xts, main = "Total Sales by Date", ylab = "sales")
# Hold out the last 30 days (December 2018) as the test window.
train <- sales_xts["/2018-11-30"]
test <- sales_xts["2018-12-01/"]
dim(train)
## [1] 1428 1
dim(test)
## [1] 30 1
# Stationarity diagnostics on the daily training series.
acf(train)
# ADF: null hypothesis = unit root; p = 0.01 rejects it (favours stationarity).
adf.test(train)
## Warning in adf.test(train): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: train
## Dickey-Fuller = -8.3022, Lag order = 11, p-value = 0.01
## alternative hypothesis: stationary
# KPSS: null hypothesis = level stationarity; p = 0.01 rejects it. The two
# tests disagree, which is consistent with auto.arima later selecting d = 1.
kpss.test(train)
## Warning in kpss.test(train): p-value smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: train
## KPSS Level = 2.3848, Truncation lag parameter = 7, p-value = 0.01
# Baseline model: non-seasonal ARIMA selected automatically -> ARIMA(0,1,1).
base <- auto.arima(train, seasonal = FALSE)
base
## Series: train
## ARIMA(0,1,1)
##
## Coefficients:
## ma1
## -0.9558
## s.e. 0.0099
##
## sigma^2 = 4593764: log likelihood = -12970.79
## AIC=25945.58 AICc=25945.59 BIC=25956.11
# Forecast horizon = length of the 30-day December hold-out.
a <- length(test)
forecast_base <- forecast(base, h = a)
plot(forecast_base, main = "Forecasts of Daily sales 2018 - baseline model")
# Attempted seasonal ARIMA. NOTE(review): `train` is an xts with no seasonal
# frequency attached, so seasonal = TRUE cannot select any seasonal terms -
# the fitted model is identical to the non-seasonal baseline above, as the
# recorded output confirms. To search for a weekly SARIMA the series would
# need an explicit frequency, e.g. ts(train, frequency = 7) - confirm intent.
model.1 <- auto.arima(train, seasonal = TRUE)
summary(model.1)
## Series: train
## ARIMA(0,1,1)
##
## Coefficients:
## ma1
## -0.9558
## s.e. 0.0099
##
## sigma^2 = 4593764: log likelihood = -12970.79
## AIC=25945.58 AICc=25945.59 BIC=25956.11
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 46.6345 2141.805 1380.525 -Inf Inf 0.7746933 0.03413206
m1.forecast <- forecast(model.1, h=a)
plot(m1.forecast, main = "Forecasts of daily sales 2018 - sarima")
# In-sample residual diagnostics (training residuals, not hold-out errors).
resid_m1 <- residuals(model.1)
mean(resid_m1)
## [1] 46.6345
checkresiduals(resid_m1)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 29.71, df = 10, p-value = 0.0009555
##
## Model df: 0. Total lags used: 10
# Holt-Winters on the daily series with gamma = FALSE: exponential smoothing
# with trend but NO seasonal component. Note the seasonal = 'additive'
# argument is inert once gamma is disabled, as the printed model confirms
# ("without seasonal component").
hw <- HoltWinters(train, gamma = FALSE, seasonal = 'additive')
hw
## Holt-Winters exponential smoothing with trend and without seasonal component.
##
## Call:
## HoltWinters(x = train, gamma = FALSE, seasonal = "additive")
##
## Smoothing parameters:
## alpha: 0.103466
## beta : 0.02833581
## gamma: FALSE
##
## Coefficients:
## [,1]
## a 3956.2250
## b 22.1407
hw_forecast <- forecast(hw, h = length(test))
# Coerce xts (test) and ts (forecast mean) to plain numeric before
# subtracting; mixing the two classes raised an 'Incompatible methods
# ("Ops.xts", "Ops.ts")' warning. Positional alignment holds: both series
# cover the same 30 hold-out days.
resid_hw <- as.numeric(test) - as.numeric(hw_forecast$mean)
plot(hw_forecast, main = "Forecasts of daily sales 2018 - HoltWinters")
plot(hw_forecast$mean, main = "Forecasts of daily sales 2018(zoom in) - HoltWinters")
checkresiduals(resid_hw)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 5.5824, df = 6, p-value = 0.4716
##
## Model df: 0. Total lags used: 6
# ARFIMA (fractional differencing, d estimated between 0 and 1) on the full
# daily training series; long-memory alternative to the ARIMA baseline.
m4 <- forecast::arfima(ts(train))
AIC(m4)
## [1] 25958.21
summary(m4)
##
## Call:
## forecast::arfima(y = ts(train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## d 0.38048 0.05255 7.241 4.47e-13 ***
## ar.ar1 0.40244 0.06000 6.707 1.98e-11 ***
## ma.ma1 0.70450 0.06126 11.500 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma[eps] = 2136.335
## [d.tol = 0.0001221, M = 100, h = 0.0001367]
## Log likelihood: -1.298e+04 ==> AIC = 25958.21 [4 deg.freedom]
pred_ARFIMA <- forecast(m4, h = length(test))
# Coerce xts (test) and ts (forecast mean) to plain numeric before
# subtracting; mixing the two classes raised an "Incompatible methods"
# warning. Both series cover the same 30 hold-out days.
resid_arf <- as.numeric(test) - as.numeric(pred_ARFIMA$mean)
checkresiduals(resid_arf)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 6.5911, df = 6, p-value = 0.3603
##
## Model df: 0. Total lags used: 6
# Mean hold-out residual = average forecast bias on the test window.
mean(resid_arf)
## [1] 0.6193146
plot(pred_ARFIMA, main = "Forecasts of daily sales 2018 - ARFIMA")
plot(pred_ARFIMA$mean, main = "Forecasts of daily sales 2018(zoom in) - ARFIMA")